Aim: study the distribution of the annotated junction categories for specific ataxia related genes.
Based on the analysis for STAR annotated introns, this report is focused on five causative Mendelian genes for Ataxia: ATXN1, ATXN2, ATXN7, CACNA1A and FXN. The results will be split by Ataxia diagnosis: Control, SCA1, SCA2, SCA6 and FRDA.
# Variables
ataxia_genes <- c("FXN", "ATXN1", "ATXN2", "ATXN7", "CACNA1A")
# Paths
metadata_path <- file.path(main_path, "metadata/metadata.csv")
multiqc_path <- file.path(main_path, "metadata/multiqc_rseqc_read_distribution.txt")
path_to_bam_files <- file.path("/home/grocamora/RytenLab-Research/11-Ataxia_bulk/Zhongbo_Script/all_filtered_BAM/")
path_to_junc_files <- file.path("/home/grocamora/RytenLab-Research/11-Ataxia_bulk/Zhongbo_Script/all_filtered_JUNC/")
# Require software paths
samtools_path <- file.path("/home/grocamora/tools/samtools/bin/")
regtools_path <- file.path("/home/grocamora/tools/regtools/build/")
# Path to the gtf file to annotate the introns and the ENCODE blacklisted region
gtf_ensembl_path <- file.path("/home/grocamora/RytenLab-Research/Additional_files/Homo_sapiens.GRCh38.105.gtf")
gtf_gencode_path <- file.path("/home/grocamora/RytenLab-Research/Additional_files/GENCODE/gencode.v38.annotation.gtf")
blacklist_path = file.path("/home/grocamora/RytenLab-Research/Additional_files/hg38-blacklist.v2.bed")
## Load the metadata
metadata <- readr::read_delim(metadata_path, show_col_types = FALSE) %>%
dplyr::rowwise() %>%
dplyr::mutate(Individual_ID = stringr::str_split(ID_anon, "_", simplify = T)[1]) %>%
dplyr::filter(!(Diagnosis %in% c("CANVAS", "AIFM1")))The first step of the pipeline is to extract the junctions from the studied BAM files. The process is split in two steps:
Download and extraction of BAM files: the files are downloaded from s3://ataxia-bulk-rnaseq/nextflow_first_attemp/Star_2_pass_by_indv/output/STAR/align/BAM_files/ and subfolders. The BAM files are then filtered to extract the information from the relevant genes studied in this report.
Junction extraction: all junctions are extracted using regtools junction extract after sorting and indexing with samtools. A file is created for each BAM file in BED12 format.
dir.create(path_to_junc_files, showWarnings = F)
#doParallel::registerDoParallel(8) # Whether to use multiprocessing
junc_files_df <- foreach(i = seq_len(nrow(metadata)), .packages = c("tidyverse")) %dopar%{
row = metadata[i, ]
sample_name = row$Correct_sample
sample_diagnosis = row$Diagnosis
sample_files_df <- foreach(j = seq_along(ataxia_genes)) %do%{
gene_name <- ataxia_genes[j]
bam_path <- paste0(path_to_bam_files, sample_name, "_", sample_diagnosis, "_", gene_name, ".bam")
sort_path <- paste0(bam_path, ".sort")
junc_path <- paste0(path_to_junc_files, sample_name, "_", sample_diagnosis, "_", gene_name, ".bam.sort.s0.junc")
# Execute the extraction pipeline if the junc file does not exists
if(!file.exists(junc_path)){
# Sort the BAM file (samtools)
system2(command = paste0(samtools_path, "samtools"), args = c(
"sort", bam_path,
"-o", sort_path
))
# Index the BAM file (samtools)
system2(command = paste0(samtools_path, "samtools"), args = c(
"index", sort_path
))
# Extract the juntions (regtools)
system2(command = paste0(regtools_path, "regtools"), args = c(
"junctions extract", sort_path,
"-m 25",
"-M 1000000",
"-s 0",
"-o", junc_path
))
# Remove side products
system2(command = "rm", args = c(sort_path))
system2(command = "rm", args = c(paste0(sort_path, ".bai")))
}
# Return information about the junc file created
return(tibble::tibble(sample_name = sample_name,
gene_name = gene_name,
junc_file_exists = file.exists(junc_path),
junc_file_path = junc_path))
} %>% dplyr::bind_rows()
} %>% dplyr::bind_rows()#doParallel::registerDoParallel(8) # Whether to use multiprocessing
all_junc <- foreach(i = seq_len(nrow(metadata)), .packages = c("tidyverse")) %dopar%{
row = metadata[i, ]
sample_id <- row$ID_anon
sample_name = row$Correct_sample
sample_diagnosis = row$Diagnosis
sample_junctions <- foreach(j = seq_along(ataxia_genes)) %do%{
gene_name <- ataxia_genes[j]
junc_path <- paste0(path_to_junc_files, sample_name, "_", sample_diagnosis, "_", gene_name, ".bam.sort.s0.junc")
# If file not found, return empty tibble
if(!file.exists(junc_path)) return(tibble())
# Read the junction file
junc <- readr::read_table(junc_path, col_names = F, col_types = "cddcdcddcdcc",
progress = F, locale = readr::locale(grouping_mark = ""))
# Transformations of the junctions
junc <- junc %>%
dplyr::select(chr = X1, start = X2, stop = X3, junID = X4, reads = X5, strand = X6, blockSizes = X11 ) %>%
dplyr::mutate(strand = ifelse(strand == "?", "*", strand)) %>%
dplyr::mutate(sample_id = sample_id,
gene_name = gene_name) %>%
tidyr::separate(col = blockSizes, sep = ",", c("blockSizesStart", "blockSizesEnd"), conver = T) %>%
dplyr::mutate(start = start + blockSizesStart + 1, stop = stop - blockSizesEnd) %>%
GenomicRanges::GRanges() %>%
diffloop::rmchr() %>%
tibble::as_tibble() %>%
dplyr::mutate(seqnames = as.character(seqnames),
strand = as.character(strand)) %>%
dplyr::select(-junID, -blockSizesStart, -blockSizesEnd)
return(junc)
} %>% dplyr::bind_rows()
} %>% dplyr::bind_rows()
# Unique ID for each junction and generate the reads table
all_reads_combined <- all_junc %>%
dplyr::group_by(seqnames, start, end) %>%
dplyr::mutate(junID = dplyr::cur_group_id(), .before = seqnames) %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(values_from = reads, names_from = c(sample_id, gene_name)) %>%
replace(is.na(.), 0)Junctions are first filtered: (i) removing all junctions that overlap with the ENCODE blacklisted region and (ii) removing all junctions that are less than 25 bp long. Then, junctions are annotated using dasper function junction_annot and a reference transcriptome (GENCODE v38 in this report).
Additionally, all junctions associated to more than one gene are removed as they are considered ambiguous.
# The annotation process takes some time, so execute only if the output is not found on disk.
gene_specific_annotated_path <- file.path(main_path, "variables/gene_specific_annotated.rds")
if(!file.exists(gene_specific_annotated_path)){
# Convert the junctions to GR object
all_junc_gr <- all_reads_combined %>%
dplyr::select(junID, seqnames, start, end, width, strand) %>%
GenomicRanges::GRanges()
# Filter by ENCODE blacklisted region
encode_blacklist_hg38 <- loadEncodeBlacklist(blacklist_path)
all_junc_gr <- removeEncodeBlacklistRegions(all_junc_gr, encode_blacklist_hg38)
# Filter junctions < 25bp long
all_junc_gr <- all_junc_gr[which(width(all_junc_gr) >= 25), ]
# Load the reference transcriptome. Some reference transcriptomes (i.e. Gencode)
# use the characters "chr" before the seqnames. To bring consistency, the
# "seqlevelsStyle" is set to "NCBI".
TxDb_ref <- dasper:::ref_load(gtf_gencode_path) %>% `seqlevelsStyle<-`("NCBI")
# Annotate the junctions
annot_junc_gr <- all_junc_gr %>% dasper::junction_annot(ref = TxDb_ref)
# Filter ambiguous genes
annot_junc_gr <- annot_junc_gr[which(sapply(annot_junc_gr$gene_id_junction, length) < 2), ]
# Remove unnecesary columns
annot_junc <- annot_junc_gr %>%
tibble::as_tibble() %>%
dplyr::select(junID, seqnames, start, end, width, strand, type)
annot_junc %>% saveRDS(gene_specific_annotated_path)
}else{
annot_junc <- readRDS(gene_specific_annotated_path)
}## [1] "2023-06-05 08:10:58 - Importing gtf/gff3 as a TxDb..."
## [1] "2023-06-05 08:12:27 - Obtaining co-ordinates of annotated exons and junctions..."
## [1] "2023-06-05 08:12:41 - Getting junction annotation using overlapping exons..."
## [1] "2023-06-05 08:12:42 - Tidying junction annotation..."
## [1] "2023-06-05 08:12:42 - Deriving junction categories..."
## [1] "2023-06-05 08:12:42 - done!"
Lastly, we combine the reads and the annotation category into a single table. An example of the final table:
annot_junc_df <- annot_junc %>%
dplyr::left_join(all_reads_combined %>% dplyr::select(-c(seqnames, start, end, width, strand)),
by = "junID") %>%
dplyr::select(-c(seqnames, start, end, width, strand)) %>%
tidyr::pivot_longer(cols = -c(junID, type), names_to = "sample_id", values_to = "count") %>%
dplyr::filter(count > 0) %>%
dplyr::mutate(gene_name = gsub("^.*_", "", sample_id),
sample_id = gsub("_[a-zA-Z1-9]*$", "", sample_id),
.before = count)
annot_junc_df %>%
dplyr::sample_n(10) %>%
kableExtra::kbl(booktabs = T, linesep = "") %>%
kableExtra::kable_classic(full_width = T, "hover", "striped", html_font = "Cambria", font_size = 14) %>%
kableExtra::row_spec(0, bold = T, font_size = 16)| junID | type | sample_id | gene_name | count |
|---|---|---|---|---|
| 570 | annotated | CO17B_Cerebellum | CACNA1A | 54 |
| 1085 | novel_exon_skip | C23B_Frontal | CACNA1A | 6 |
| 1858 | unannotated | CO20B_Frontal | FXN | 2 |
| 486 | novel_acceptor | CO3A_Cerebellum | CACNA1A | 3 |
| 1797 | annotated | C22B_Frontal | ATXN1 | 9 |
| 202 | annotated | CO9A_Cerebellum | ATXN2 | 78 |
| 236 | novel_exon_skip | CO14B_Frontal | ATXN2 | 2 |
| 863 | novel_acceptor | C24B_Cerebellum | CACNA1A | 3 |
| 1281 | unannotated | C19B_Frontal | CACNA1A | 4 |
| 1084 | novel_donor | C25B_Frontal | CACNA1A | 1 |
To better understand the results presented in the report, it is important to consider the number of junctions associated to each gene. The following graph represents the number of junctions associated to each of the studies genes. Each point represent a sample from where the junctions were extracted.
Figure 3.1: Number of junctions associated to each of the studied genes. The low number of junctions for gene FXN explain the inconsistent results found in later sections.
The first plots shown here represent the distributions using all categories from the dasper annotation function.
Across all visualizations, a Wilcoxon signed-rank test between the distributions was executed, and significant results (\(p<0.05\)) are shown in the graphs. The p-values are Bonferroni corrected given the number of tests executed per group.
First, we plot the category percentages by sample. We employ the function plotJunctionCategories. Source.
Figure 4.1: Proportion of junctions by Diagnosis and tissue. Results presented for all junctions found within the studied genes.
Figure 4.2: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN1 gene.
Figure 4.3: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN2 gene.
Figure 4.4: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN7 gene.
Figure 4.5: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the CACNA1A gene.
Figure 4.6: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the FXN gene.
Next, we also represent the proportion of counts (or reads) that correspond to each category when compared against the total number of junction reads. We use the function plotJunctionCountCategories.
Figure 4.7: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Cerebellum brain region.
Figure 4.8: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Frontal Cortex brain region.
Figure 4.9: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Cerebellum brain region.
Figure 4.10: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Frontal Cortex brain region.
Figure 4.11: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Cerebellum brain region.
Figure 4.12: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Frontal Cortex brain region.
Figure 4.13: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Cerebellum brain region.
Figure 4.14: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Frontal Cortex brain region.
Figure 4.15: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Cerebellum brain region.
Figure 4.16: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Frontal Cortex brain region.
Figure 4.17: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Cerebellum brain region.
Figure 4.18: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Frontal Cortex brain region.
Lastly, the same visualizations are presented but using only three categories: Annotated, Partially annotated and Unannotated.
Figure 5.1: Proportion of junctions by Diagnosis and tissue. Results presented for all junctions found within the studied genes.
Figure 5.2: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN1 gene.
Figure 5.3: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN2 gene.
Figure 5.4: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the ATXN7 gene.
Figure 5.5: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the CACNA1A gene.
Figure 5.6: Proportion of junctions by Diagnosis and tissue. Results presented for junctions located in the FXN gene.
Figure 5.7: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Cerebellum brain region.
Figure 5.8: Proportion of counts by disease status. Results presented for junctions located within all studied genes and from samples extracted in the Frontal Cortex brain region.
Figure 5.9: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Cerebellum brain region.
Figure 5.10: Proportion of counts by disease status. Results presented for junctions located in the ATXN1 gene and from samples extracted in the Frontal Cortex brain region.
Figure 5.11: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Cerebellum brain region.
Figure 5.12: Proportion of counts by disease status. Results presented for junctions located in the ATXN2 gene and from samples extracted in the Frontal Cortex brain region.
Figure 5.13: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Cerebellum brain region.
Figure 5.14: Proportion of counts by disease status. Results presented for junctions located in the ATXN7 gene and from samples extracted in the Frontal Cortex brain region.
Figure 5.15: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Cerebellum brain region.
Figure 5.16: Proportion of counts by disease status. Results presented for junctions located in the CACNA1A gene and from samples extracted in the Frontal Cortex brain region.
Figure 5.17: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Cerebellum brain region.
Figure 5.18: Proportion of counts by disease status. Results presented for junctions located in the FXN gene and from samples extracted in the Frontal Cortex brain region.